Call Libraries
library(tidyverse)
library(caret)
library(MASS)
library(car)
library(moments)
Calling the Transformed Datasets
# Read the cleaned income dataset (columns: Name, Group, Year, Num, Amount, Avg).
income_cleaned <- read_csv("NYS_Corp_Tax_Credit_data/income_cleaned.csv")
Rows: 1921 Columns: 6── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Name, Group
dbl (4): Year, Num, Amount, Avg
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Auto-print the loaded income data for a quick visual check.
income_cleaned
# Read the cleaned industry dataset (columns: Name, Group, Year, Num, Amount, Avg).
industry_cleaned <- read_csv("NYS_Corp_Tax_Credit_data/industry_cleaned.csv")
Rows: 2476 Columns: 6── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Name, Group
dbl (4): Year, Num, Amount, Avg
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Auto-print the loaded industry data for a quick visual check.
industry_cleaned
Creating the Models
# Baseline linear model on the untransformed response.
# income_cleaned only has the columns Name, Group, Year, Num, Amount and Avg, so
# fitting sat.formula to it failed with "object 'Avg.bc' not found": that
# formula's response is the Box-Cox column, which this data frame does not have.
# Amount is left out of the predictors, matching the commented-out baseline fit
# that follows.
# NOTE(review): if the intent was instead to fit sat.formula, pass the
# Box-Cox-transformed income data frame here -- confirm with the author.
income.model <- lm(Avg ~ . - Amount, data = income_cleaned)
# linear.model.cleaned2 <- lm(Avg ~ . - Amount, data = income_cleaned)
# s = summary(linear.model.cleaned)
# show(s)
# plot(linear.model.cleaned)
#
# #histograms of response variable to check distribution
# hist(income_cleaned$Avg)
# hist(income_cleaned_bc$Avg.bc)
#
# #Shapiro-Wilks test to evaluate normality
# shapiro.test(income_cleaned$Avg)
# shapiro.test(income_cleaned_bc$Avg.bc)
#
# #Kurtosis evaluation (normal distribution has a value close to 3)
# moments::kurtosis(income_cleaned$Avg)
# moments::kurtosis(income_cleaned_bc$Avg.bc) #evaluated to 2.75 which means very close to 3 so it is normally distributed with possibly slightly less outliers.
Correcting the violation of normality in the previous model with a Box-Cox transformation
Checking linear regression assumptions for the transformed data.
# Industry
# Run the diagnostics helper on industry_cleaned_bc (presumably the
# Box-Cox-transformed industry data -- confirm). Judging by the printed output
# that follows, it reports a Shapiro-Wilk test on df[[field]], one numeric
# statistic, the lm() summary for sat.formula, and a GVIF table.
# sat.model.summary(), sat.field and sat.formula are defined outside this view.
sat.model.summary(industry_cleaned_bc, sat.field, sat.formula)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.9902, p-value = 5.826e-12
[1] 2.513097
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-6.2890 -0.4698 0.0171 0.4942 3.9076
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.143e+01 1.110e+01 -5.532 3.51e-08 ***
Year 3.464e-02 5.504e-03 6.294 3.67e-10 ***
NameAlternative Fuels and Electric Vehicle Recharging Property Credit 2.667e-01 2.734e-01 0.976 0.329351
NameAlternative Minimum Tax Credit -2.461e+00 2.273e-01 -10.827 < 2e-16 ***
NameBeer Production Credit 6.344e-01 3.381e-01 1.876 0.060762 .
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 2.199e+00 2.416e-01 9.101 < 2e-16 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 2.726e+00 3.930e-01 6.937 5.13e-12 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.587e+00 2.581e-01 6.149 9.11e-10 ***
NameBrownfield Tax Credits - Remediation Real Property Tax Credit 9.698e-01 2.571e-01 3.773 0.000165 ***
NameClean Heating Fuel Credit -2.544e+00 2.482e-01 -10.250 < 2e-16 ***
NameConservation Easement Tax Credit -1.122e+00 2.538e-01 -4.421 1.03e-05 ***
NameCredit for Employment of Persons with Disabilities -1.244e+00 2.738e-01 -4.546 5.74e-06 ***
NameCredit for Purchase of an Automated External Defibrillator -1.535e+00 2.411e-01 -6.364 2.34e-10 ***
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities -8.142e-02 4.475e-01 -0.182 0.855647
NameEmpire State Apprentice Tax Credit -8.684e-01 5.464e-01 -1.589 0.112126
NameEmpire State Commercial Production Credit 6.241e-01 3.333e-01 1.872 0.061269 .
NameEmpire State Film Post Production Credit 1.439e+00 2.821e-01 5.100 3.65e-07 ***
NameEmpire State Film Production Credit 3.233e+00 2.584e-01 12.509 < 2e-16 ***
NameEmpire State Musical and Theatrical Production Credit 1.012e+00 4.866e-01 2.080 0.037610 *
NameExcelsior Jobs Program Credit 1.644e+00 2.372e-01 6.930 5.39e-12 ***
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 1.310e+00 2.332e-01 5.619 2.14e-08 ***
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 7.150e-01 2.275e-01 3.142 0.001696 **
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.708e+00 2.264e-01 7.542 6.53e-14 ***
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners 9.784e-01 2.328e-01 4.202 2.74e-05 ***
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit 1.616e-01 2.304e-01 0.701 0.483154
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners -5.234e-01 2.723e-01 -1.923 0.054647 .
NameFarm Workforce Retention Credit -8.624e-01 2.890e-01 -2.984 0.002876 **
NameFarmers' School Tax Credit -1.158e+00 2.772e-01 -4.176 3.07e-05 ***
NameHire a Veteran Credit -1.040e+00 4.457e-01 -2.333 0.019731 *
NameHistoric Properties Rehabilitation Credit 2.398e+00 2.614e-01 9.172 < 2e-16 ***
NameInvestment Tax Credit 6.997e-01 2.230e-01 3.138 0.001724 **
NameInvestment Tax Credit for the Financial Services Industry 1.561e+00 3.109e-01 5.020 5.53e-07 ***
NameLife Sciences Research & Development Tax Credit 9.053e-01 4.864e-01 1.861 0.062867 .
NameLong-Term Care Insurance Credit -1.866e+00 2.268e-01 -8.228 3.08e-16 ***
NameLow-Income Housing Credit 1.331e+00 2.967e-01 4.486 7.58e-06 ***
NameManufacturer\u0092s Real Property Tax Credit -6.077e-01 2.528e-01 -2.404 0.016286 *
NameManufacturer�s Real Property Tax Credit -7.363e-01 2.758e-01 -2.669 0.007652 **
NameMinimum Wage Reimbursement Credit -9.159e-01 2.366e-01 -3.871 0.000111 ***
NameMortgage Servicing Tax Credit 8.932e-01 3.490e-01 2.560 0.010540 *
NameNew York Youth Jobs Program Tax Credit 1.266e-01 2.307e-01 0.549 0.583231
NameQETC Capital Tax Credit 1.163e+00 3.169e-01 3.670 0.000248 ***
NameQETC Employment Credit -3.045e-01 2.412e-01 -1.263 0.206769
NameQETC Facilities, Operations, and Training Credit 1.148e+00 3.624e-01 3.168 0.001554 **
NameSpecial Additional Mortgage Recording Tax Credit 7.509e-01 2.394e-01 3.137 0.001728 **
NameSTART-UP NY Tax Elimination Credit -1.834e+00 2.565e-01 -7.150 1.15e-12 ***
GroupAdministrative and Support and Waste Management and Remediation Services 2.884e-01 1.360e-01 2.120 0.034078 *
GroupAdministrative/Support/Waste Management/Remediation Services 1.680e-01 1.511e-01 1.112 0.266251
GroupAgriculture, Forestry, Fishing and Hunting -1.135e-01 1.258e-01 -0.902 0.367038
GroupArts, Entertainment, and Recreation 5.595e-01 1.251e-01 4.472 8.11e-06 ***
GroupConstruction -4.326e-02 1.172e-01 -0.369 0.712181
GroupEducational Services 2.253e-01 1.592e-01 1.415 0.157097
GroupFinance and Insurance 5.350e-01 1.099e-01 4.870 1.19e-06 ***
GroupHealth Care and Social Assistance -6.769e-02 1.237e-01 -0.547 0.584224
GroupInformation 7.096e-01 1.176e-01 6.034 1.84e-09 ***
GroupManagement of Companies and Enterprises 6.670e-01 1.053e-01 6.337 2.79e-10 ***
GroupManufacturing 4.604e-01 1.095e-01 4.204 2.72e-05 ***
GroupMining 2.521e-01 1.940e-01 1.299 0.194080
GroupMining, Quarrying, and Oil and Gas Extraction 3.858e-01 1.626e-01 2.372 0.017774 *
GroupOther Services (except Public Administration) -1.549e-01 1.197e-01 -1.293 0.195989
GroupProfessional, Scientific, and Technical Services 4.877e-01 1.126e-01 4.332 1.54e-05 ***
GroupReal Estate and Rental and Leasing 1.464e-01 1.104e-01 1.326 0.184896
GroupRetail Trade 3.752e-01 1.100e-01 3.411 0.000657 ***
GroupTransportation and Warehousing 2.125e-01 1.242e-01 1.711 0.087224 .
GroupUtilities 6.979e-01 1.422e-01 4.908 9.83e-07 ***
GroupWholesale Trade 4.364e-01 1.124e-01 3.882 0.000106 ***
Num 4.555e-04 2.307e-04 1.974 0.048459 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8704 on 2410 degrees of freedom
Multiple R-squared: 0.7632, Adjusted R-squared: 0.7568
F-statistic: 119.5 on 65 and 2410 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.202893 1 1.484214
Name 5.218180 43 1.019397
Group 2.726612 20 1.025394
Num 1.371056 1 1.170921
Visualization and model stats
Stepwise regression on Income_cat_bc (the Box-Cox-transformed dataset)
# Create dummy-variable columns for stepwise regression.
#
# Expands every predictor of `Avg.bc ~ .` into its model-matrix columns
# (intercept dropped), appends the response as `Avg.bc`, and replaces the
# characters - ' / space , and the Unicode replacement character in the column
# names with "_" so the names can be used in formulas.
#
# Args:
#   df: data frame containing the response column `Avg.bc`; all other columns
#       are treated as predictors.
# Returns:
#   A data.frame of the expanded predictor columns plus `Avg.bc`. Rows that
#   the model frame drops for missing values are dropped from the response
#   too, so predictors and response always stay aligned.
#
# NOTE(review): the sanitising pattern does not cover every mis-encoded
# apostrophe variant, so the same label can still end up as two separate dummy
# columns (e.g. the two "Manufacturer...s Real Property Tax Credit" columns in
# the output further down). Consider normalising the labels upstream.
dummy_func <- function(df) {
  if (!"Avg.bc" %in% names(df)) {
    stop("`df` must contain the response column `Avg.bc`.", call. = FALSE)
  }

  # Build the model frame once so that NA handling is applied to predictors
  # and response together; model.matrix() alone would drop NA rows while
  # df$Avg.bc kept them.
  mf <- model.frame(Avg.bc ~ ., data = df)

  # drop = FALSE keeps a one-column result as a matrix, so its column name
  # survives the conversion to a data frame.
  x <- model.matrix(attr(mf, "terms"), data = mf)[, -1, drop = FALSE]

  dummy_bc <- as.data.frame(x)
  dummy_bc[["Avg.bc"]] <- unname(model.response(mf))
  colnames(dummy_bc) <- str_replace_all(colnames(dummy_bc), "-|'|/| |,|�", "_")
  dummy_bc
}
Cleaning the column names further so that stepwise regression does not raise any errors
# Coefficient table (estimate, std. error, t value, p-value) of the "income"
# entry of forwardBIC.
summary(forwardBIC[["income"]])$coefficients
Estimate
(Intercept) -35.990882731
Group500_000_000___and_over 2.332162162
NameAlternative_Minimum_Tax_Credit -2.181442261
NameEmpire_State_Film_Production_Credit 2.706498239
NameClean_Heating_Fuel_Credit -3.225725322
NameLong_Term_Care_Insurance_Credit -3.009758493
NameCredit_for_Purchase_of_an_Automated_External_Defibrillator -3.083797646
NameCredit_for_Employment_of_Persons_with_Disabilities -3.263143617
Group100_000_000___499_999_999 1.687712446
NameIndustrial_or_Manufacturing_Business_Tax_Credit -1.859343201
NameConservation_Easement_Tax_Credit -2.286344505
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_6_23_08_but_before_7_1_15 1.567464241
NameHistoric_Properties_Rehabilitation_Credit 1.771621603
NameEZ_QEZE_Tax_Credits___EZ_Investment_Tax_Credit 0.743142678
NameEZ_QEZE_Tax_Credits___QEZE_Credit_for_Real_Property_Taxes 1.075600313
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___Prior_to_6_23_08 1.183854448
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_7_1_15 2.528124751
Group50_000_000___99_999_999 1.471106534
Group25_000_000___49_999_999 1.323850513
Group1_000_000___24_999_999 1.102078605
NameEmpire_State_Film_Post_Production_Credit 0.953736879
GroupZero_or_Net_Loss 1.009495036
NameSTART_UP_NY_Tax_Elimination_Credit -2.340948626
Num -0.001593991
NameNew_York_Youth_Jobs_Program_Tax_Credit -1.475123174
NameExcelsior_Jobs_Program_Credit 0.319231772
NameHire_a_Veteran_Credit -3.063449985
NameManufactureru0092s_Real_Property_Tax_Credit -1.542240689
NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit_For_Corporate_Partners -1.506419640
NameFarmers__School_Tax_Credit -1.458275259
NameMinimum_Wage_Reimbursement_Credit -1.386166927
NameManufacturer_s_Real_Property_Tax_Credit -1.901589373
NameEmpire_State_Apprentice_Tax_Credit -2.354512875
NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit -0.994257922
NameFarm_Workforce_Retention_Credit -1.758773498
Group500_000___999_999 0.606172216
NameMortgage_Servicing_Tax_Credit -1.084274186
NameLow_Income_Housing_Credit -1.053403737
Group100_000___499_999 0.362336938
NameQETC_Employment_Credit -0.737161626
NameAlternative_Fuels_and_Electric_Vehicle_Recharging_Property_Credit -1.014111072
Year 0.022774473
NameSpecial_Additional_Mortgage_Recording_Tax_Credit -0.375077530
Std. Error
(Intercept) 1.303392e+01
Group500_000_000___and_over 1.008232e-01
NameAlternative_Minimum_Tax_Credit 1.329270e-01
NameEmpire_State_Film_Production_Credit 1.611038e-01
NameClean_Heating_Fuel_Credit 1.730917e-01
NameLong_Term_Care_Insurance_Credit 1.550110e-01
NameCredit_for_Purchase_of_an_Automated_External_Defibrillator 1.492740e-01
NameCredit_for_Employment_of_Persons_with_Disabilities 2.075005e-01
Group100_000_000___499_999_999 1.030520e-01
NameIndustrial_or_Manufacturing_Business_Tax_Credit 1.693476e-01
NameConservation_Easement_Tax_Credit 1.892509e-01
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_6_23_08_but_before_7_1_15 1.610123e-01
NameHistoric_Properties_Rehabilitation_Credit 1.863273e-01
NameEZ_QEZE_Tax_Credits___EZ_Investment_Tax_Credit 9.108533e-02
NameEZ_QEZE_Tax_Credits___QEZE_Credit_for_Real_Property_Taxes 1.264744e-01
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___Prior_to_6_23_08 1.592699e-01
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_7_1_15 3.146055e-01
Group50_000_000___99_999_999 1.108276e-01
Group25_000_000___49_999_999 1.105266e-01
Group1_000_000___24_999_999 9.207831e-02
NameEmpire_State_Film_Post_Production_Credit 1.836029e-01
GroupZero_or_Net_Loss 8.787141e-02
NameSTART_UP_NY_Tax_Elimination_Credit 2.164849e-01
Num 2.185414e-04
NameNew_York_Youth_Jobs_Program_Tax_Credit 1.351498e-01
NameExcelsior_Jobs_Program_Credit 1.428132e-01
NameHire_a_Veteran_Credit 4.216004e-01
NameManufactureru0092s_Real_Property_Tax_Credit 1.778721e-01
NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit_For_Corporate_Partners 1.801175e-01
NameFarmers__School_Tax_Credit 1.722154e-01
NameMinimum_Wage_Reimbursement_Credit 1.663310e-01
NameManufacturer_s_Real_Property_Tax_Credit 2.490134e-01
NameEmpire_State_Apprentice_Tax_Credit 3.468158e-01
NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit 1.330842e-01
NameFarm_Workforce_Retention_Credit 2.712498e-01
Group500_000___999_999 1.017994e-01
NameMortgage_Servicing_Tax_Credit 1.987360e-01
NameLow_Income_Housing_Credit 2.086113e-01
Group100_000___499_999 9.429505e-02
NameQETC_Employment_Credit 1.742513e-01
NameAlternative_Fuels_and_Electric_Vehicle_Recharging_Property_Credit 2.545435e-01
Year 6.477265e-03
NameSpecial_Additional_Mortgage_Recording_Tax_Credit 1.275357e-01
t value
(Intercept) -2.761325
Group500_000_000___and_over 23.131215
NameAlternative_Minimum_Tax_Credit -16.410833
NameEmpire_State_Film_Production_Credit 16.799719
NameClean_Heating_Fuel_Credit -18.635932
NameLong_Term_Care_Insurance_Credit -19.416413
NameCredit_for_Purchase_of_an_Automated_External_Defibrillator -20.658644
NameCredit_for_Employment_of_Persons_with_Disabilities -15.725952
Group100_000_000___499_999_999 16.377290
NameIndustrial_or_Manufacturing_Business_Tax_Credit -10.979445
NameConservation_Easement_Tax_Credit -12.081023
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_6_23_08_but_before_7_1_15 9.735059
NameHistoric_Properties_Rehabilitation_Credit 9.508115
NameEZ_QEZE_Tax_Credits___EZ_Investment_Tax_Credit 8.158753
NameEZ_QEZE_Tax_Credits___QEZE_Credit_for_Real_Property_Taxes 8.504488
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___Prior_to_6_23_08 7.433007
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_7_1_15 8.035858
Group50_000_000___99_999_999 13.273831
Group25_000_000___49_999_999 11.977660
Group1_000_000___24_999_999 11.968927
NameEmpire_State_Film_Post_Production_Credit 5.194563
GroupZero_or_Net_Loss 11.488322
NameSTART_UP_NY_Tax_Elimination_Credit -10.813448
Num -7.293769
NameNew_York_Youth_Jobs_Program_Tax_Credit -10.914729
NameExcelsior_Jobs_Program_Credit 2.235310
NameHire_a_Veteran_Credit -7.266240
NameManufactureru0092s_Real_Property_Tax_Credit -8.670503
NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit_For_Corporate_Partners -8.363541
NameFarmers__School_Tax_Credit -8.467739
NameMinimum_Wage_Reimbursement_Credit -8.333787
NameManufacturer_s_Real_Property_Tax_Credit -7.636495
NameEmpire_State_Apprentice_Tax_Credit -6.788943
NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit -7.470893
NameFarm_Workforce_Retention_Credit -6.483963
Group500_000___999_999 5.954577
NameMortgage_Servicing_Tax_Credit -5.455852
NameLow_Income_Housing_Credit -5.049601
Group100_000___499_999 3.842587
NameQETC_Employment_Credit -4.230451
NameAlternative_Fuels_and_Electric_Vehicle_Recharging_Property_Credit -3.984039
Year 3.516063
NameSpecial_Additional_Mortgage_Recording_Tax_Credit -2.940961
Pr(>|t|)
(Intercept) 5.812749e-03
Group500_000_000___and_over 2.297059e-104
NameAlternative_Minimum_Tax_Credit 1.159347e-56
NameEmpire_State_Film_Production_Credit 4.073941e-59
NameClean_Heating_Fuel_Credit 2.949876e-71
NameLong_Term_Care_Insurance_Credit 1.117821e-76
NameCredit_for_Purchase_of_an_Automated_External_Defibrillator 1.322482e-85
NameCredit_for_Employment_of_Persons_with_Disabilities 1.913693e-52
Group100_000_000___499_999_999 1.878901e-56
NameIndustrial_or_Manufacturing_Business_Tax_Credit 3.165215e-27
NameConservation_Easement_Tax_Credit 2.048552e-32
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_6_23_08_but_before_7_1_15 6.963935e-22
NameHistoric_Properties_Rehabilitation_Credit 5.703747e-21
NameEZ_QEZE_Tax_Credits___EZ_Investment_Tax_Credit 6.128337e-16
NameEZ_QEZE_Tax_Credits___QEZE_Credit_for_Real_Property_Taxes 3.664849e-17
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___Prior_to_6_23_08 1.604289e-13
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_7_1_15 1.625998e-15
Group50_000_000___99_999_999 1.681295e-38
Group25_000_000___49_999_999 6.549761e-32
Group1_000_000___24_999_999 7.222780e-32
NameEmpire_State_Film_Post_Production_Credit 2.274726e-07
GroupZero_or_Net_Loss 1.432032e-29
NameSTART_UP_NY_Tax_Elimination_Credit 1.759331e-26
Num 4.423394e-13
NameNew_York_Youth_Jobs_Program_Tax_Credit 6.194355e-27
NameExcelsior_Jobs_Program_Credit 2.551416e-02
NameHire_a_Veteran_Credit 5.394423e-13
NameManufactureru0092s_Real_Property_Tax_Credit 9.129189e-18
NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit_For_Corporate_Partners 1.170266e-16
NameFarmers__School_Tax_Credit 4.968842e-17
NameMinimum_Wage_Reimbursement_Credit 1.491953e-16
NameManufacturer_s_Real_Property_Tax_Credit 3.531386e-14
NameEmpire_State_Apprentice_Tax_Credit 1.509979e-11
NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit 1.213726e-13
NameFarm_Workforce_Retention_Credit 1.138968e-10
Group500_000___999_999 3.104646e-09
NameMortgage_Servicing_Tax_Credit 5.519900e-08
NameLow_Income_Housing_Credit 4.855996e-07
Group100_000___499_999 1.257912e-04
NameQETC_Employment_Credit 2.444515e-05
NameAlternative_Fuels_and_Electric_Vehicle_Recharging_Property_Credit 7.034072e-05
Year 4.483930e-04
NameSpecial_Additional_Mortgage_Recording_Tax_Credit 3.311991e-03
Stepwise regression using BIC as the criterion (the penalty is k = log(n)).
# Check the variance inflation factor (VIF) of each predictor in the selected
# backward-BIC model; values close to 1 indicate little collinearity.
vif(backwardBIC)
Year
1.661856
NameAlternative_Fuels_and_Electric_Vehicle_Recharging_Property_Credit
1.040066
NameAlternative_Minimum_Tax_Credit
1.355727
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_6_23_08_but_before_7_1_15
1.077139
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_7_1_15
1.032409
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___Prior_to_6_23_08
1.076724
NameClean_Heating_Fuel_Credit
1.086361
NameConservation_Easement_Tax_Credit
1.071241
NameCredit_for_Employment_of_Persons_with_Disabilities
1.050364
NameCredit_for_Purchase_of_an_Automated_External_Defibrillator
1.084252
NameEmpire_State_Apprentice_Tax_Credit
1.028286
NameEmpire_State_Film_Post_Production_Credit
1.067360
NameEmpire_State_Film_Production_Credit
1.079378
NameEZ_QEZE_Tax_Credits___EZ_Investment_Tax_Credit
1.203345
NameEZ_QEZE_Tax_Credits___QEZE_Credit_for_Real_Property_Taxes
1.110401
NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit
1.094191
NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit_For_Corporate_Partners
1.059267
NameFarm_Workforce_Retention_Credit
1.043336
NameFarmers__School_Tax_Credit
1.075009
NameHire_a_Veteran_Credit
1.016637
NameHistoric_Properties_Rehabilitation_Credit
1.066991
NameIndustrial_or_Manufacturing_Business_Tax_Credit
1.180919
NameLong_Term_Care_Insurance_Credit
1.085593
NameLow_Income_Housing_Credit
1.059690
NameManufactureru0092s_Real_Property_Tax_Credit
1.059324
NameManufacturer_s_Real_Property_Tax_Credit
1.051337
NameMinimum_Wage_Reimbursement_Credit
1.074669
NameMortgage_Servicing_Tax_Credit
1.068997
NameNew_York_Youth_Jobs_Program_Tax_Credit
1.109900
NameQETC_Employment_Credit
1.072612
NameSpecial_Additional_Mortgage_Recording_Tax_Credit
1.114775
NameSTART_UP_NY_Tax_Elimination_Credit
1.055977
ENI1_000_000___24_999_999
1.850053
ENI100_000___499_999
1.795816
ENI100_000_000___499_999_999
1.691776
ENI25_000_000___49_999_999
1.578815
ENI50_000_000___99_999_999
1.576866
ENI500_000___999_999
1.708105
ENI500_000_000___and_over
1.744570
ENIZero_or_Net_Loss
1.867768
Num
1.533625
Splitting the data into training and test sets (the test set is year 2019; the training set is all other years)
# Dimensions (rows, columns) of the held-out test predictors.
dim(X.test)
[1] 170 41
Lasso regression for comparison to backward stepwise
# NOTE(review): glmnet() and cv.glmnet() come from the glmnet package, which is
# not among the library() calls visible at the top of this file -- confirm it
# is attached before this chunk runs.

# Candidate penalties: 100 values, log-spaced from 10^2 down to 10^-5.
lambda.grid <- 10^seq(2, -5, length.out = 100)

# Fit the whole lasso path (alpha = 1) on the training data.
lasso.models <- glmnet(X.train, y.train, alpha = 1, lambda = lambda.grid)

# Visualize how the coefficients shrink as lambda grows.
plot(lasso.models, xvar = "lambda", label = TRUE, main = "Lasso Regression")

# 10-fold cross-validation over the same grid; the seed makes the fold
# assignment reproducible.
set.seed(0)
cv.lasso.models <- cv.glmnet(
  X.train, y.train,
  alpha = 1, lambda = lambda.grid, nfolds = 10
)

# Cross-validated error across the lambda grid.
plot(cv.lasso.models, main = "Lasso Regression")

# Lambda with the smallest cross-validated error (printed on the log scale
# first, then on the raw scale).
log(cv.lasso.models$lambda.min)
best.lambda <- cv.lasso.models$lambda.min
best.lambda
# Values recorded from earlier runs:
#   all variables included:            0.0006892612
#   only the bwdBIC coefficients kept: 0.0003053856

# Lasso coefficients at best.lambda.
best.lambda.coeff <- predict(lasso.models, s = best.lambda, type = "coefficients")
best.lambda.coeff

# Predictions for the TEST data at best.lambda. Nothing is refitted here: the
# predictions are read off the path fitted above. The variable name says
# "train" but holds test-set predictions; it is kept unchanged in case later
# chunks refer to it.
lasso.best.lambda.train.pred <- predict(lasso.models, s = best.lambda, newx = X.test)
lasso.best.lambda.train.pred

# Test-set mean squared error, on the scale of y.test (presumably the
# Box-Cox-transformed response -- confirm).
mean((lasso.best.lambda.train.pred - y.test)^2)

# Undo the Box-Cox transform y = (x^lambda - 1) / lambda. A lambda of 0 means
# the transform was log(x), whose inverse is exp(y); the power formula would
# silently return 1 for every row in that case. as.vector() turns the n x 1
# prediction matrix into a plain numeric column.
as.data.frame(lasso.best.lambda.train.pred) %>%
  mutate(
    Avg_in_dollars = if (isTRUE(all.equal(lambda.bc, 0))) {
      exp(as.vector(lasso.best.lambda.train.pred))
    } else {
      (as.vector(lasso.best.lambda.train.pred) * lambda.bc + 1)^(1 / lambda.bc)
    }
  )